from pyspark import SparkConf, SparkContext
import os
# Word-count job: read a text file, split it into words, and print how many
# times each word occurs.

# Tell PySpark which Python interpreter to launch for its workers. Set before
# the SparkContext is created so the setting is in place when it starts.
os.environ['PYSPARK_PYTHON'] = "C:/Users/Lenovo/AppData/Local/Programs/Python/Python310/python.exe"

# Run locally, using all available cores.
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

try:
    # One RDD element per line of the input file.
    rdd = sc.textFile("E:/excel学习文档-表格/word.txt")

    # Extract all words. str.split() with no argument splits on any run of
    # whitespace and drops empty tokens, so repeated spaces, tabs, and blank
    # lines do not produce a bogus '' "word" in the counts.
    word_rdd = rdd.flatMap(lambda line: line.split())

    # Turn each word into a (word, 1) pair: the word is the key, 1 the count.
    word_t = word_rdd.map(lambda word: (word, 1))

    # Sum the counts per word.
    word_group = word_t.reduceByKey(lambda x, y: x + y)
    print(word_group.collect())
finally:
    # Always release the Spark context, even if the job above fails
    # (e.g. the input file is missing).
    sc.stop()